{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "from __future__ import division\n",
    "from IPython.display import display, HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import pandas as pd\n",
    "import MySQLdb as mdb\n",
    "import bs4\n",
    "import datetime\n",
    "from collections import defaultdict\n",
    "from matplotlib import pyplot as plt\n",
    "from ipywidgets import widgets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def table_to_dataframe(name, connection):\n",
    "    return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)\n",
    "\n",
    "def project_table_to_dataframe(name, connection):\n",
    "    # project_id 1 is monorail\n",
    "    return pd.read_sql(\"SELECT * FROM {} where project_id = 1;\".format(name) , con=connection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "connection = mdb.connect(host=\"localhost\", user=\"root\", db=\"monorail\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cursor = connection.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Only look at monorail issues, and only look at issues opened in the past year.\n",
    "issue = pd.read_sql(\"SELECT * FROM Issue where project_id = 1 and opened > 1436396241;\", con=connection)\n",
    "comment = pd.read_sql(\"SELECT * FROM Comment where project_id = 1 and created > 1436396241;\", con=connection)\n",
    "status_def = project_table_to_dataframe(\"StatusDef\", connection)\n",
    "issue_summarny = table_to_dataframe(\"IssueSummary\", connection)\n",
    "issue_label = table_to_dataframe(\"Issue2Label\", connection)\n",
    "issue_component = table_to_dataframe(\"Issue2Component\", connection)\n",
    "issue_update = table_to_dataframe(\"IssueUpdate\", connection)\n",
    "issue.rename(columns={\"id\":\"issue_id\"}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Number of Issues\", issue.shape[0])\n",
    "print(\"Number of IssueUpdates\", issue_update.shape[0])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Associate IssueUpdates with their Issues\n",
    "This next step is resource intensive and can take a while."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "updates_by_issue = defaultdict(list)\n",
    "i = 0\n",
    "for index, row in issue_update.iterrows():\n",
    "    updates_by_issue[row[\"issue_id\"]].append(row)\n",
    "    if i % 1000000 == 0:\n",
    "        print(i)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "issues_by_id = {}\n",
    "i = 0\n",
    "for index, row in issue.iterrows():\n",
    "    issues_by_id[row[\"issue_id\"]] = row\n",
    "    if i % 1000000 == 0:\n",
    "        print(i)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "status_by_id = {}\n",
    "i = 0\n",
    "for index, row in status_def.iterrows():\n",
    "    status_by_id[row[\"id\"]] = row\n",
    "    if i % 1000000 == 0:\n",
    "        print(i)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "issue[\"updates\"] = issue[\"issue_id\"].apply(lambda i_id: [u for u in sorted(updates_by_issue[i_id], key=lambda x: x.id)])\n",
    "issue[\"num_updates\"] = issue[\"updates\"].apply(lambda updates: len(updates))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sns.distplot(issue[\"num_updates\"], kde=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def StatusPath(i_id, updates):\n",
    "    statuses = []\n",
    "    for update in updates:\n",
    "        if update.field == 'status':\n",
    "            if len(statuses) == 0:\n",
    "                statuses.append(update.old_value if update.old_value else 'none')\n",
    "            statuses.append(update.new_value if update.new_value else 'none')\n",
    "\n",
    "    if len(statuses) == 0:\n",
    "        # use ~np.isnan here instead?\n",
    "        if issues_by_id[i_id].status_id == issues_by_id[i_id].status_id: # cheap NaN hack\n",
    "            status_id = int(issues_by_id[i_id].status_id)\n",
    "            if status_id is not NaN and status_id in status_by_id:\n",
    "                statuses = [status_by_id[status_id].status]\n",
    "            else:\n",
    "                statuses = ['mystery status id: %d' % status_id]\n",
    "        else:\n",
    "            statuses = ['never had status']\n",
    "    statuses = [s.decode('utf-8', errors='replace') for s in statuses]\n",
    "    return u'->'.join(statuses)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "issue[\"status_path\"] = issue[\"issue_id\"].apply(lambda i_id: StatusPath(i_id, sorted(updates_by_issue[i_id], key=lambda x: x.id)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "plt.rcParams['figure.figsize']=(10,25)\n",
    "by_path = issue.groupby([\"status_path\"]).size()\n",
    "by_path.sort()\n",
    "by_path.plot(kind='barh')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Find distributions of time-to-close for various closed states.\n",
    "\n",
    "closed_issue = issue[issue[\"closed\"] > 0]\n",
    "    \n",
    "closed_issue[\"time_to_close\"] = closed_issue[\"issue_id\"].apply(lambda i_id: issues_by_id[i_id].closed - issues_by_id[i_id].opened)\n",
    "closed_issue[\"issue_state\"] = closed_issue[\"status_id\"].apply(lambda s_id: status_by_id[s_id].status)\n",
    "print(\"Number of closed issues %d\" % closed_issue.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.rcParams['figure.figsize']=(10,5)\n",
    "sns.distplot(closed_issue[closed_issue[\"time_to_close\"] < 1e7][\"time_to_close\"], kde=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# filter for time_to_close < 1e7 (~11 days since timestamps are seconds)\n",
    "# since the time_to_close distribution skews waaaay out\n",
    "sns.boxplot(data=closed_issue, x=\"time_to_close\", y=\"issue_state\", palette=\"colorblind\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
